{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e7e97dfe-a53f-4b6c-bfe6-96d9cc98e1cd",
   "metadata": {},
   "source": [
    "# Import modules and connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c2ad8cb1-1b47-4fd6-8454-7e6e034ff5b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'h2QwONxsT4Kt-lTRKmPrhg', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "from elasticsearch import Elasticsearch, helpers\n",
    "import wget, zipfile, pandas as pd, json, openai\n",
    "import streamlit as st\n",
    "from tqdm.notebook import tqdm\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Connection settings come from the environment (or a local .env file).\n",
    "openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "elastic_user = os.getenv(\"ES_USER\")\n",
    "elastic_password = os.getenv(\"ES_PASSWORD\")\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# Fail early with a readable message instead of trying to reach\n",
    "# \"https://None:None@None:9200\" when a setting is missing.\n",
    "missing_settings = [\n",
    "    name\n",
    "    for name, value in (\n",
    "        (\"ES_USER\", elastic_user),\n",
    "        (\"ES_PASSWORD\", elastic_password),\n",
    "        (\"ES_ENDPOINT\", elastic_endpoint),\n",
    "    )\n",
    "    if not value\n",
    "]\n",
    "if missing_settings:\n",
    "    raise RuntimeError(f\"Missing Elasticsearch settings: {', '.join(missing_settings)}\")\n",
    "\n",
    "# Credentials go through basic_auth rather than into the URL: a password\n",
    "# containing '@', ':' or '/' would otherwise corrupt the URL, and a URL with\n",
    "# the password in it can end up in logs or tracebacks.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "print(client.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2065018b-1db5-48c1-9f89-3464b98f807e",
   "metadata": {},
   "source": [
    "# Configure OpenAI connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "05acda13-7086-4476-bb0e-ee868b52ce8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "# No key is passed explicitly; it is expected to be available as OPENAI_API_KEY\n",
    "# in the environment populated by load_dotenv() in the first cell.\n",
    "# NOTE: this rebinds the name `openai` (imported as a module in the first cell)\n",
    "# to a client instance.\n",
    "openai = OpenAI()\n",
    "\n",
    "# Retrieving the embedding model's metadata doubles as a connectivity check.\n",
    "embedding_model_id = \"text-embedding-ada-002\"\n",
    "openai.models.retrieve(embedding_model_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ce15dc5-75a6-47b5-9b18-52c873d4aec7",
   "metadata": {},
   "source": [
    "# Download the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b08aff11-cfb1-4382-9a5a-ed4f0a476bbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the archive only when it is not already next to the notebook, so that\n",
    "# Restart & Run All works from a fresh checkout without re-downloading each time.\n",
    "embeddings_url = \"https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip\"\n",
    "archive_path = \"vector_database_wikipedia_articles_embedded.zip\"\n",
    "\n",
    "if not os.path.exists(archive_path):\n",
    "    wget.download(embeddings_url, out=archive_path)\n",
    "\n",
    "# Unpack the CSV into ./data, where the next cell reads it from.\n",
    "with zipfile.ZipFile(archive_path, \"r\") as zip_ref:\n",
    "    zip_ref.extractall(\"data\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49424e92-b5b8-4919-861c-6fa597c6e396",
   "metadata": {},
   "source": [
    "# Read CSV file into a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7cb7b01b-f39a-4887-812b-06d8037c67ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the extracted CSV of embedded Wikipedia articles into memory.\n",
    "wikipedia_dataframe = pd.read_csv(\n",
    "    \"data/vector_database_wikipedia_articles_embedded.csv\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ed584af-60d0-4d64-8fb6-e1476dbb48b1",
   "metadata": {},
   "source": [
    "# Create index with mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8dce7e96-150e-4435-a5ee-93f96b9c7a08",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'wikipedia_vector_index'})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Field mapping for the article index: two embedding fields plus article metadata.\n",
    "index_mapping = {\n",
    "    \"properties\": {\n",
    "        \"title_vector\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 1536,\n",
    "            \"index\": True,  # a real boolean instead of the string \"true\"\n",
    "            \"similarity\": \"cosine\",\n",
    "        },\n",
    "        \"content_vector\": {\n",
    "            \"type\": \"dense_vector\",\n",
    "            \"dims\": 1536,\n",
    "            \"index\": True,\n",
    "            \"similarity\": \"cosine\",\n",
    "        },\n",
    "        \"text\": {\"type\": \"text\"},\n",
    "        \"title\": {\"type\": \"text\"},\n",
    "        \"url\": {\"type\": \"keyword\"},\n",
    "        \"vector_id\": {\"type\": \"long\"},\n",
    "    }\n",
    "}\n",
    "\n",
    "# Creating an index that already exists raises, which made this cell fail on\n",
    "# every re-run; only create the index when it is missing.\n",
    "if client.indices.exists(index=\"wikipedia_vector_index\"):\n",
    "    print(\"Index 'wikipedia_vector_index' already exists; skipping creation.\")\n",
    "else:\n",
    "    print(client.indices.create(index=\"wikipedia_vector_index\", mappings=index_mapping))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4717a495-253b-4fae-abfd-c5574cbf10f2",
   "metadata": {},
   "source": [
    "# Index data into Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4e46f1c0-b10e-4c5d-bc0e-db3aa47be2af",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataframe_to_bulk_actions(df):\n",
    "    for index, row in df.iterrows():\n",
    "        yield {\n",
    "            \"_index\": 'wikipedia_vector_index',\n",
    "            \"_id\": row['id'],\n",
    "            \"_source\": {\n",
    "                'url' : row[\"url\"],\n",
    "                'title' : row[\"title\"],\n",
    "                'text' : row[\"text\"],\n",
    "                'title_vector' : json.loads(row[\"title_vector\"]),\n",
    "                'content_vector' : json.loads(row[\"content_vector\"]),\n",
    "                'vector_id' : row[\"vector_id\"]\n",
    "            }\n",
    "        }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0e112f3c-14bc-4ada-9a45-c9c0fdd6c109",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "37e636275acf47e3953c73a3d8097996",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/25000 [00:00<?, ?documents/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "total_documents = len(wikipedia_dataframe)\n",
    "success_count = 0\n",
    "failure_count = 0\n",
    "\n",
    "# The context manager closes the progress bar even if indexing is interrupted.\n",
    "with tqdm(total=total_documents, unit=\"documents\") as progress_bar:\n",
    "    for ok, info in helpers.streaming_bulk(\n",
    "        client,\n",
    "        actions=dataframe_to_bulk_actions(wikipedia_dataframe),\n",
    "        raise_on_error=False,  # failed documents are reported below instead\n",
    "        chunk_size=100,\n",
    "    ):\n",
    "        if ok:\n",
    "            success_count += 1\n",
    "        else:\n",
    "            failure_count += 1\n",
    "            # info is keyed by the operation type; read it generically so an\n",
    "            # unexpected key cannot turn error reporting into a KeyError.\n",
    "            op_type, details = next(iter(info.items()))\n",
    "            print(f\"Unable to {op_type} {details.get('_id')}: {details.get('error')}\")\n",
    "        progress_bar.update(1)\n",
    "        progress_bar.set_postfix(success=success_count)\n",
    "\n",
    "print(f\"Indexed {success_count} of {total_documents} documents ({failure_count} failed).\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c012595-727c-48f2-a676-6bff6e9ebe61",
   "metadata": {},
   "source": [
    "# Build application with Streamlit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "489ab74e-5842-4329-b401-bc4d419b5b8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[K\u001b[?25hm##################\u001b[0m) ⠧ reify:localtunnel: \u001b[32;40mhttp\u001b[0m \u001b[35mfetch\u001b[0m GET 200 https://registry.n\u001b[0m\u001b[K/regis\u001b[0m\u001b[K\n",
      "added 22 packages, and audited 23 packages in 15s\n",
      "\n",
      "3 packages are looking for funding\n",
      "  run `npm fund` for details\n",
      "\n",
      "2 \u001b[33m\u001b[1mmoderate\u001b[22m\u001b[39m severity vulnerabilities\n",
      "\n",
      "To address all issues, run:\n",
      "  npm audit fix\n",
      "\n",
      "Run `npm audit` for details.\n",
      "\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[36;40mnotice\u001b[0m\u001b[35m\u001b[0m \n",
      "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[36;40mnotice\u001b[0m\u001b[35m\u001b[0m New \u001b[31mmajor\u001b[39m version of npm available! \u001b[31m8.19.2\u001b[39m -> \u001b[32m10.4.0\u001b[39m\n",
      "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[36;40mnotice\u001b[0m\u001b[35m\u001b[0m Changelog: \u001b[36mhttps://github.com/npm/cli/releases/tag/v10.4.0\u001b[39m\n",
      "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[36;40mnotice\u001b[0m\u001b[35m\u001b[0m Run \u001b[32mnpm install -g npm@10.4.0\u001b[39m to update!\n",
      "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[36;40mnotice\u001b[0m\u001b[35m\u001b[0m \n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!npm install localtunnel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0da4a991-127b-45d4-b16a-cfae5fb07f10",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting app.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile app.py\n",
    "\n",
    "import os\n",
    "import streamlit as st\n",
    "import openai\n",
    "from elasticsearch import Elasticsearch\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from openai import OpenAI\n",
    "\n",
    "# Load .env before any client is built. The OpenAI client used to be created\n",
    "# first, so a key present only in .env was not visible yet when the app is\n",
    "# started on its own.\n",
    "load_dotenv()\n",
    "\n",
    "openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "elastic_user = os.getenv(\"ES_USER\")\n",
    "elastic_password = os.getenv(\"ES_PASSWORD\")\n",
    "elastic_endpoint = os.getenv(\"ES_ENDPOINT\")\n",
    "\n",
    "# NOTE: this rebinds the name `openai` from the module to a client instance;\n",
    "# the functions below call the client through this name.\n",
    "openai = OpenAI(api_key=openai_api_key)\n",
    "\n",
    "# Credentials go through basic_auth rather than into the URL, so special\n",
    "# characters in the password cannot corrupt the URL.\n",
    "url = f\"https://{elastic_endpoint}:9200\"\n",
    "client = Elasticsearch(\n",
    "    url,\n",
    "    basic_auth=(elastic_user, elastic_password),\n",
    "    ca_certs=\"./http_ca.crt\",\n",
    "    verify_certs=True,\n",
    ")\n",
    "\n",
    "# Define model\n",
    "EMBEDDING_MODEL = \"text-embedding-ada-002\"\n",
    "\n",
    "\n",
    "def openai_summarize(query, response):\n",
    "    context = response['hits']['hits'][0]['_source']['text']\n",
    "    summary = openai.chat.completions.create(\n",
    "    model=\"gpt-3.5-turbo\",\n",
    "    messages=[\n",
    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "            {\"role\": \"user\", \"content\": \"Answer the following question:\" + query + \"by using the following text: \" + context},\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    print(summary)\n",
    "    return summary.choices[0].message.content\n",
    "\n",
    "\n",
    "def search_es(query, k=10, num_candidates=100):\n",
    "    \"\"\"Run a kNN search on ``content_vector`` using the embedding of ``query``.\n",
    "\n",
    "    Args:\n",
    "        query: Free-text question; embedded with ``EMBEDDING_MODEL``.\n",
    "        k: Value passed as the kNN ``k`` option (defaults to the original 10).\n",
    "        num_candidates: Value passed as the kNN ``num_candidates`` option\n",
    "            (defaults to the original 100).\n",
    "\n",
    "    Returns:\n",
    "        The search response returned by the Elasticsearch client.\n",
    "    \"\"\"\n",
    "    # Create embedding\n",
    "    question_embedding = openai.embeddings.create(input=query, model=EMBEDDING_MODEL)\n",
    "    query_vector = question_embedding.data[0].embedding\n",
    "\n",
    "    # Define Elasticsearch query\n",
    "    return client.search(\n",
    "        index=\"wikipedia_vector_index\",\n",
    "        knn={\n",
    "            \"field\": \"content_vector\",\n",
    "            \"query_vector\": query_vector,\n",
    "            \"k\": k,\n",
    "            \"num_candidates\": num_candidates,\n",
    "        },\n",
    "    )\n",
    "\n",
    "\n",
    "def main():\n",
    "    \"\"\"Render the Streamlit page: question box, answer summary and matching articles.\"\"\"\n",
    "    st.title(\"Gen AI Application\")\n",
    "\n",
    "    # Input for user search query\n",
    "    user_query = st.text_input(\"Enter your question:\")\n",
    "\n",
    "    if not st.button(\"Search\"):\n",
    "        return\n",
    "    if not user_query.strip():\n",
    "        # Previously an empty query was ignored without any feedback.\n",
    "        st.warning(\"Please enter a question first.\")\n",
    "        return\n",
    "\n",
    "    st.write(f\"Searching for: {user_query}\")\n",
    "    result = search_es(user_query)\n",
    "    hits = result['hits']['hits']\n",
    "\n",
    "    # Check for hits before summarizing: the summary reads the first hit, so\n",
    "    # asking for it on an empty result used to crash with IndexError.\n",
    "    if not hits:\n",
    "        st.write(\"No results found.\")\n",
    "        return\n",
    "\n",
    "    openai_summary = openai_summarize(user_query, result)\n",
    "    st.write(f\"OpenAI Summary: {openai_summary}\")\n",
    "\n",
    "    # Display search results\n",
    "    st.write(\"Search Results:\")\n",
    "    for hit in hits:\n",
    "        st.write(hit['_source']['title'])\n",
    "        st.write(hit['_source']['text'])\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df034dc0-dbce-449e-a9eb-2eedcafeb7b7",
   "metadata": {},
   "source": [
    "# Run the application"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc4c16f9-0d5b-447c-a6fa-ca7811fafa08",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0m\n",
      "\u001b[34m\u001b[1m  You can now view your Streamlit app in your browser.\u001b[0m\n",
      "\u001b[0m\n",
      "\u001b[34m  Local URL: \u001b[0m\u001b[1mhttp://localhost:8502\u001b[0m\n",
      "\u001b[34m  Network URL: \u001b[0m\u001b[1mhttp://198.18.0.17:8502\u001b[0m\n",
      "\u001b[0m\n",
      "\u001b[34m\u001b[1m  For better performance, install the Watchdog module:\u001b[0m\n",
      "\n",
      "  $ xcode-select --install\n",
      "  $ pip install watchdog\n",
      "            \u001b[0m\n",
      "ChatCompletion(id='chatcmpl-8qJmP31VcFxCq0E2XBh5e4y3lDVpE', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Football is a popular sport that can refer to several different games. The most well-known type of football is association football, which is commonly known as soccer. In North America, South Africa, and Australia, it is called soccer to avoid confusion with other types of football played in those regions.\\n\\nThe name football comes from the combination of the words \"foot\" and \"ball.\" The game is called football because the players primarily use their feet to kick, but it can also involve other parts of the body depending on the specific game. The objective is to move a ball, typically shaped like a sphere or an ellipsoid, into the opponent\\'s goal. Players can use any part of their body except their arms and hands, while goalies can use any part of their body to prevent the ball from entering the goal.\\n\\nFootball has a long history and has been seen in various forms in different countries dating back to ancient times. The modern versions of rugby football and association football emerged in 19th-century England. Association football spread across Europe and eventually globally through colonization. 
Today, it is the most popular sport worldwide, with millions of people watching events like the FIFA World Cup every four years.\\n\\nIn addition to association football, there are other types of football played in different countries, including American football, rugby league football, rugby union football, gridiron football, college football, touch rugby, Canadian football, Gaelic football, and Australian Rules Football.\\n\\nReferences:\\n- Britannica: Football\\n- FIFA: The Official Laws of the Game\\n- UEFA: Champions League', role='assistant', function_call=None, tool_calls=None))], created=1707479781, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=311, prompt_tokens=523, total_tokens=834))\n"
     ]
    }
   ],
   "source": [
    "!streamlit run app.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4929e1ef-a5f3-41b5-aa77-01283a69863c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
